This analyses was conducted in 2020-02, and was updated on 2020-08-01 to use the new results files as input and to create an html report.
library(speedyseq)
library(tidyverse)
library(here)
library(cowplot)
library(ggridges)
library(ggbeeswarm)
library(ggforce)
theme_set(theme_minimal_grid(12))
# import::from(metacal, mutate_by, close_elts)
# Dev version of metacal for pairwise_ratios function
devtools::load_all("~/research/r-packages/metacal", helpers = FALSE)
my_psmelt <- function(physeq) {
speedyseq::psmelt(physeq) %>%
as_tibble %>%
rename(taxon = OTU, sample = Sample, abundance = Abundance)
}
as_tibble.phyloseq <- function(x) {
x %>% my_psmelt
}
as_tibble.sample_data <- function(x) {
x %>% as("data.frame") %>% as_tibble(rownames = "sample")
}
as_tibble.taxonomyTable <- function(x) {
x %>% as("matrix") %>% as_tibble(rownames = "taxon")
}
as_tibble.XStringSet <- function(x) {
x %>% as.character %>% enframe("taxon", "sequence")
}
Load phyloseq object,
ps <- readRDS(here("results/a1/a1-phyloseq-silva-v138.Rds")) %>%
print
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 367 taxa and 96 samples ]
#> sample_data() Sample Data: [ 96 samples by 22 sample variables ]
#> tax_table() Taxonomy Table: [ 367 taxa by 7 taxonomic ranks ]
#> refseq() DNAStringSet: [ 367 reference sequences ]
sample_data(ps) <- sample_data(ps) %>%
transform(extraction_batch = as.factor(extraction_batch))
pstb <- ps %>% as_tibble
tax <- tax_table(ps) %>% as_tibble
ref <- refseq(ps) %>% as_tibble
sam <- sample_data(ps) %>% as_tibble
Get custom taxonomic assignments based on exact matching to 16S references,
custom_species <- dada2::assignSpecies(
ref$sequence,
here("strain-info", "inoculum-tmob-zymo-16s.fasta"),
verbose = TRUE
) %>%
as_tibble(rownames = "sequence") %>%
rename_all(str_to_lower) %>%
left_join(ref, by = "sequence")
#> 22 out of 367 were assigned to the species level.
custom_species %>% filter(!is.na(genus)) %>% print(n=Inf)
#> # A tibble: 22 x 4
#> sequence genus species taxon
#> <chr> <chr> <chr> <chr>
#> 1 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides ovatus A1_AS…
#> 2 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGC… Escherichia/S… coli A1_AS…
#> 3 TACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGT… Akkermansia muciniphila A1_AS…
#> 4 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides uniformis A1_AS…
#> 5 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides thetaiotao… A1_AS…
#> 6 TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGT… Staphylococcus aureus A1_AS…
#> 7 TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGT… Clostridium symbiosum A1_AS…
#> 8 TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGC… Roseburia intestinal… A1_AS…
#> 9 TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGT… Bacteroides caccae A1_AS…
#> 10 TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGGGCGCGT… Thioflavicocc… mobilis A1_AS…
#> 11 AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGC… Faecalibacter… prausnitzii A1_AS…
#> 12 TACGTAGGGGGCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGT… Collinsella aerofaciens A1_AS…
#> 13 TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGT… Marvinbryantia formatexig… A1_AS…
#> 14 TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGC… Eubacterium rectale A1_AS…
#> 15 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGT… Barnesiella intestinih… A1_AS…
#> 16 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGT… Barnesiella intestinih… A1_AS…
#> 17 TACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGAGAGTGC… Lactobacillus fermentum A1_AS…
#> 18 TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTCGC… Bacillus subtilis A1_AS…
#> 19 TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGC… Salmonella enterica A1_AS…
#> 20 TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGCGCGC… Listeria monocytoge… A1_AS…
#> 21 TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC… Enterococcus faecalis A1_AS…
#> 22 TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGT… Pseudomonas aeruginosa A1_AS…
qplot(taxa_sums(ps)) +
scale_x_log10()
ps0 <- ps %>%
filter_taxa2(~sum(.) > 1000)
Find ASVs that seem to come from the same genome,
# correlations
asv_cor <- ps0 %>%
transform_sample_counts(~ . / sum(.)) %>%
otu_table %>%
cor
# genetic distances
dna <- refseq(ps0)
nproc <- 4
aln <- DECIPHER::AlignSeqs(dna, processors = nproc, verbose = FALSE)
d <- DECIPHER::DistanceMatrix(aln, processors = nproc, verbose = FALSE)
# tbl to view distances and correlations
dist_tb <- list(cor = asv_cor, dist = d) %>%
map(as_tibble, rownames = "taxon1") %>%
map_dfr(pivot_longer, -taxon1, names_to = "taxon2", .id = "type") %>%
pivot_wider(names_from = type, values_from = value) %>%
mutate_at(paste0("taxon", 1:2), factor, ordered = TRUE,
levels = paste0("A1_ASV", seq(ntaxa(ps0)))) %>%
filter(taxon1 < taxon2)
ggplot(dist_tb, aes(dist, cor)) +
geom_point() +
scale_x_log10()
dist_tb %>%
filter(dist < 0.01, cor > 0.95)
#> # A tibble: 5 x 4
#> taxon1 taxon2 cor dist
#> <ord> <ord> <dbl> <dbl>
#> 1 A1_ASV1 A1_ASV8 0.989 0.00791
#> 2 A1_ASV9 A1_ASV17 0.997 0.00395
#> 3 A1_ASV13 A1_ASV26 0.997 0.00395
#> 4 A1_ASV19 A1_ASV20 0.996 0.00395
#> 5 A1_ASV23 A1_ASV29 0.985 0.00395
Merge these pairs by repeated calls to merge_taxa().
to_merge <- dist_tb %>%
filter(dist < 0.01, cor > 0.95) %>%
# select(taxon1, taxon2) %>%
mutate_at(paste0("taxon", 1:2), as.character) %>%
mutate(pair = map2(taxon1, taxon2, c)) %>%
pull(pair)
ps1 <- purrr::reduce(
to_merge,
.init = ps0,
.f = merge_taxa
)
TODO: Look into the two pairs with small distances but low (negative) correlations.
Check identification
tax_table(ps0) %>% as_tibble %>%
filter(genus == "Bacteroides") %>%
select(taxon, genus, species)
#> # A tibble: 5 x 3
#> taxon genus species
#> <chr> <chr> <chr>
#> 1 A1_ASV1 Bacteroides fragilis/ovatus
#> 2 A1_ASV4 Bacteroides uniformis
#> 3 A1_ASV5 Bacteroides faecichinchillae/faecis/finegoldii/thetaiotaomicron
#> 4 A1_ASV8 Bacteroides caecimuris/fragilis/ovatus/xylanisolvens
#> 5 A1_ASV10 Bacteroides caccae
So we can distinguish all the Bacteroides.
top_taxa_sums <- ps1 %>%
subset_samples(specimen_type == "inoculum") %>%
taxa_sums %>%
sort(decreasing = TRUE) %>%
head(25)
top_taxa_sums
#> A1_ASV2 A1_ASV6 A1_ASV9 A1_ASV12 A1_ASV7 A1_ASV13 A1_ASV3 A1_ASV4 A1_ASV5
#> 759865 708938 243955 232970 222337 221722 181042 137149 126390
#> A1_ASV15 A1_ASV10 A1_ASV16 A1_ASV1 A1_ASV18 A1_ASV24 A1_ASV11 A1_ASV19 A1_ASV23
#> 104693 75971 53457 43959 15874 13138 543 207 11
#> A1_ASV25 A1_ASV22 A1_ASV27 A1_ASV28 A1_ASV14 A1_ASV21
#> 9 5 2 2 0 0
tax_table(ps1) %>%
as_tibble %>%
filter(taxon %in% names(top_taxa_sums)) %>%
print(n=Inf)
#> # A tibble: 24 x 8
#> taxon kingdom phylum class order family genus species
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… <NA>
#> 2 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Escheri… <NA>
#> 3 A1_AS… Bacteria Verrucom… Verruco… Verruc… Akkerma… Akkerma… muciniphila
#> 4 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… uniformis
#> 5 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… faecichinchillae/faeci…
#> 6 A1_AS… Bacteria Firmicut… Bacilli Staphy… Staphyl… Staphyl… argenteus/aureus/capit…
#> 7 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Lachnoc… <NA>
#> 8 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Rosebur… <NA>
#> 9 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Bactero… Bactero… caccae
#> 10 A1_AS… Bacteria Proteoba… Gammapr… Chroma… Chromat… Thiofla… mobilis
#> 11 A1_AS… Bacteria Firmicut… Clostri… Oscill… Ruminoc… Faecali… cf./prausnitzii
#> 12 A1_AS… Bacteria Firmicut… Bacilli Bacill… Bacilla… Bacillus <NA>
#> 13 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Escheri… coli
#> 14 A1_AS… Bacteria Actinoba… Corioba… Coriob… Corioba… Collins… aerofaciens
#> 15 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Marvinb… formatexigens
#> 16 A1_AS… Bacteria Firmicut… Clostri… Lachno… Lachnos… Agathob… <NA>
#> 17 A1_AS… Bacteria Bacteroi… Bactero… Bacter… Barnesi… Barnesi… intestinihominis
#> 18 A1_AS… Bacteria Firmicut… Bacilli Lactob… Lactoba… Lactoba… casei/curvatus/delbrue…
#> 19 A1_AS… Bacteria Firmicut… Bacilli Bacill… Bacilla… Bacillus altitudinis/amylolique…
#> 20 A1_AS… Bacteria Proteoba… Gammapr… Entero… Enterob… Salmone… <NA>
#> 21 A1_AS… Bacteria Firmicut… Bacilli Bacill… Bacilla… Bacillus acidicola/aerius/aerop…
#> 22 A1_AS… Bacteria Firmicut… Bacilli Lactob… Listeri… Listeria innocua/ivanovii/marth…
#> 23 A1_AS… Bacteria Firmicut… Bacilli Lactob… Enteroc… Enteroc… <NA>
#> 24 A1_AS… Bacteria Proteoba… Gammapr… Pseudo… Pseudom… Pseudom… aeruginosa/denitrifica…
inoc_taxa <- names(top_taxa_sums)[top_taxa_sums > 1000]
ps2 <- ps1 %>%
subset_samples(specimen_type %in% c("inoculum", "fecal")) %>%
prune_taxa(inoc_taxa, .)
This method might leave out ASVs that are very low abundance in the inoculum or poorly measured, but is ok for now. Note, I left in the Staph contaminant ASV6.
ps2 %>%
as_tibble %>%
group_by(specimen_id, specimen_type, taxon) %>%
summarize(across(abundance, sum), .groups = "drop") %>%
mutate_by(specimen_id, proportion = abundance %>% close_elts) %>%
mutate(taxon = factor(taxon, inoc_taxa)) %>%
ggplot(aes(taxon, proportion + 1e-5, color = specimen_type)) +
geom_quasirandom() +
scale_y_log10() +
coord_flip()
Roughly consistent with cross-sample contamination contributing roughly 1% of total reads.
From this figure, the following ASVs look like they are completely absent from the fecal specimens: 6, 12, 13, 24. (ASV6 is Staph).
taxa <- paste0("A1_ASV", c(6, 12, 13, 24))
tax_table(ps1)[taxa, 1:6]
#> Taxonomy Table: [4 taxa by 6 taxonomic ranks]:
#> kingdom phylum class order family
#> A1_ASV6 "Bacteria" "Firmicutes" "Bacilli" "Staphylococcales" "Staphylococcaceae"
#> A1_ASV12 "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales" "Ruminococcaceae"
#> A1_ASV13 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Bacillaceae"
#> A1_ASV24 "Bacteria" "Firmicutes" "Bacilli" "Bacillales" "Bacillaceae"
#> genus
#> A1_ASV6 "Staphylococcus"
#> A1_ASV12 "Faecalibacterium"
#> A1_ASV13 "Bacillus"
#> A1_ASV24 "Bacillus"
dist_tb %>%
filter(taxon1 %in% taxa, taxon2 %in% taxa)
#> # A tibble: 6 x 4
#> taxon1 taxon2 cor dist
#> <ord> <ord> <dbl> <dbl>
#> 1 A1_ASV6 A1_ASV12 0.811 0.261
#> 2 A1_ASV6 A1_ASV13 0.613 0.0395
#> 3 A1_ASV6 A1_ASV24 0.512 0.0711
#> 4 A1_ASV12 A1_ASV13 0.812 0.249
#> 5 A1_ASV12 A1_ASV24 0.465 0.253
#> 6 A1_ASV13 A1_ASV24 0.339 0.0593
custom_species %>%
filter(taxon %in% taxa) %>%
select(-sequence)
#> # A tibble: 4 x 3
#> genus species taxon
#> <chr> <chr> <chr>
#> 1 Staphylococcus aureus A1_ASV6
#> 2 Faecalibacterium prausnitzii A1_ASV12
#> 3 <NA> <NA> A1_ASV13
#> 4 <NA> <NA> A1_ASV24
I don’t think a Bacillus should be in the inoculum. But it is in the Zymo mock.
Hypothesis: The zymo mock taxa are being read into the inoculum samples.
tb <- ps1 %>%
prune_samples("A1_zymo_posc", .) %>%
as_tibble %>%
arrange(-abundance) %>%
select(taxon:abundance, rank_names(ps1)) %>%
print(n=Inf)
#> # A tibble: 24 x 10
#> taxon sample abundance kingdom phylum class order family genus species
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A1_AS… A1_zym… 18326 Bacter… Firmic… Bacil… Staph… Staphy… Staph… argenteus/aur…
#> 2 A1_AS… A1_zym… 17238 Bacter… Proteo… Gamma… Enter… Entero… Salmo… <NA>
#> 3 A1_AS… A1_zym… 15574 Bacter… Firmic… Bacil… Lacto… Lactob… Lacto… casei/curvatu…
#> 4 A1_AS… A1_zym… 15526 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… altitudinis/a…
#> 5 A1_AS… A1_zym… 12778 Bacter… Firmic… Bacil… Lacto… Lister… Liste… innocua/ivano…
#> 6 A1_AS… A1_zym… 11809 Bacter… Proteo… Gamma… Enter… Entero… Esche… <NA>
#> 7 A1_AS… A1_zym… 9191 Bacter… Firmic… Bacil… Lacto… Entero… Enter… <NA>
#> 8 A1_AS… A1_zym… 8502 Bacter… Proteo… Gamma… Pseud… Pseudo… Pseud… aeruginosa/de…
#> 9 A1_AS… A1_zym… 166 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… <NA>
#> 10 A1_AS… A1_zym… 49 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… uniformis
#> 11 A1_AS… A1_zym… 29 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… faecichinchil…
#> 12 A1_AS… A1_zym… 23 Bacter… Bacter… Bacte… Bacte… Bacter… Bacte… caccae
#> 13 A1_AS… A1_zym… 17 Bacter… Firmic… Clost… Lachn… Lachno… Lachn… <NA>
#> 14 A1_AS… A1_zym… 17 Bacter… Firmic… Clost… Lachn… Lachno… Roseb… <NA>
#> 15 A1_AS… A1_zym… 13 Bacter… Verruc… Verru… Verru… Akkerm… Akker… muciniphila
#> 16 A1_AS… A1_zym… 1 Bacter… Actino… Corio… Corio… Coriob… Colli… aerofaciens
#> 17 A1_AS… A1_zym… 0 Bacter… Proteo… Gamma… Chrom… Chroma… Thiof… mobilis
#> 18 A1_AS… A1_zym… 0 Bacter… Firmic… Clost… Oscil… Rumino… Faeca… cf./prausnitz…
#> 19 A1_AS… A1_zym… 0 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… <NA>
#> 20 A1_AS… A1_zym… 0 Bacter… Proteo… Gamma… Enter… Entero… Esche… coli
#> 21 A1_AS… A1_zym… 0 Bacter… Firmic… Clost… Lachn… Lachno… Marvi… formatexigens
#> 22 A1_AS… A1_zym… 0 Bacter… Firmic… Clost… Lachn… Lachno… Agath… <NA>
#> 23 A1_AS… A1_zym… 0 Bacter… Bacter… Bacte… Bacte… Barnes… Barne… intestinihomi…
#> 24 A1_AS… A1_zym… 0 Bacter… Firmic… Bacil… Bacil… Bacill… Bacil… acidicola/aer…
tb %>%
filter(genus == "Bacillus") %>%
select(taxon, abundance, genus)
#> # A tibble: 3 x 3
#> taxon abundance genus
#> <chr> <dbl> <chr>
#> 1 A1_ASV22 15526 Bacillus
#> 2 A1_ASV13 0 Bacillus
#> 3 A1_ASV24 0 Bacillus
Nope, since ASV13 and ASV24 are not found in the Zymo sample at all. Could have also guess this as they weren’t identified by the custom species assignment,
These are the taxa that seem like they should actually be in the inoculum by design, and that are found in the fecal samples as well, minus ASV18 since it is generally quite rare
# focal_taxa <- paste0("ASV", c(1, 2, 3, 4, 5, 7, 9, 10, 15, 16, 18))
focal_taxa <- paste0("A1_ASV", c(1, 2, 3, 4, 5, 7, 9, 10, 15, 16))
ps3 <- prune_taxa(focal_taxa, ps2)
ps3.pw <- ps3 %>%
transform_sample_counts(~ . + 1) %>%
pairwise_ratios(margin = "taxa", filter = TRUE) %>%
pairwise_ratios(margin = "samples", filter = FALSE) %>%
subset_samples(specimen_id.1 == specimen_id.2) %>%
subset_samples(extraction_protocol.1 == "1") %>%
subset_samples(extraction_protocol.2 == "2")
pwtb <- ps3.pw %>%
as_tibble %>%
rename(pair = taxon, bias = abundance) %>%
separate(pair, c("taxon.1", "taxon.2"), sep = ":", remove = FALSE)
pwtb %>%
ggplot(aes(pair, log(bias), color = specimen_type.1)) +
geom_quasirandom() +
coord_flip() +
facet_grid(. ~ extraction_batch.1)
Look at just the Bacteroides,
bacteroides <- tax_table(ps3) %>% as_tibble %>%
filter(genus == "Bacteroides") %>%
pull(taxon)
bacteroides_pairs <- crossing(
taxon1 = bacteroides,
taxon2 = bacteroides
) %>%
unite("pair", taxon1, taxon2, sep = ":") %>%
pull(pair)
pwtb %>%
filter(pair %in% bacteroides_pairs) %>%
ggplot(aes(pair, log(bias),
label = specimen_id.1, color = specimen_type.1)) +
geom_quasirandom() +
# geom_text(position = position_jitter(height = 0, width = 0.2)) +
coord_flip() +
facet_wrap(~paste("batch", extraction_batch.1), nrow = 1)
Note, there is clustering by specimen_id.
Summary: - Bias between the four Bacteroides taxa is small - ASV1 appears to have a lower differential bias in the inoculum than in the fecal samples except in specimens 4 and 25. Notably, it is much rarer in the inoculum samples.
Look at the remaining pairs,
pwtb %>%
filter(!(pair %in% bacteroides_pairs)) %>%
ggplot(aes(pair, log(bias),
label = specimen_id.1, color = specimen_type.1)) +
geom_quasirandom() +
# geom_text(position = position_jitter(height = 0, width = 0.2)) +
coord_flip() +
facet_wrap(~paste("batch", extraction_batch.1), nrow = 1)
Looks like ASV2 has a different bias in the inoc and fecal samples.
Look at just the E. coli pairs,
pwtb %>%
# filter(!(pair %in% bacteroides_pairs)) %>%
filter(str_detect(pair, "ASV2")) %>%
ggplot(aes(pair, log(bias),
label = specimen_id.1, color = specimen_type.1)) +
geom_quasirandom() +
# geom_text(position = position_jitter(height = 0, width = 0.2)) +
coord_flip() +
facet_wrap(~paste("batch", extraction_batch.1), nrow = 1) +
theme(
panel.spacing = unit(1, "cm")
)
Look at E. coli versus bacteroides
pwtb %>%
# filter(!(pair %in% bacteroides_pairs)) %>%
filter(
str_detect(pair, "ASV2"),
taxon.1 %in% bacteroides | taxon.2 %in% bacteroides
) %>%
ggplot(aes(pair, log(bias),
label = specimen_id.1, color = specimen_type.1)) +
geom_quasirandom(width = 0.2) +
# geom_text(position = position_jitter(height = 0, width = 0.2)) +
stat_summary(
fun.data = mean_se,
position = position_nudge(x = 0.3),
# shape = 124,
# fatten = 20
) +
coord_flip() +
facet_wrap(~paste("batch", extraction_batch.1), nrow = 1) +
theme(
panel.spacing = unit(1, "cm")
)
Clear pattern where E. coli has the same efficiency as Bacteroides in vivo but not in vitro.
To try to understand the outliers, it might be easier to look at samples rather than pairs of samples. Might be worth looking into whether the outliers could be due to cross-sample contamination.
pca <- ps3 %>%
transform_sample_counts(~clr(. + 1)) %>%
otu_table %>%
prcomp
x <- pca$x %>%
as_tibble(rownames = "sample") %>%
left_join(sample_data(ps3) %>% as_tibble, by = "sample")
# percent variance explained by each pc
pca_perc <- (pca$sdev^2) %>% close_elts %>% {round(. * 100, 1)} %>% print
#> [1] 58.0 22.7 7.9 5.0 3.1 1.6 1.2 0.4 0.2 0.0
ggplot(x, aes(PC1, PC2,
shape = extraction_batch,
# color = specimen_type)) +
color = extraction_protocol)) +
geom_point(size = 3) +
scale_color_brewer(type = "qual", palette = 2) +
scale_shape_manual(values = c(3, 6)) +
geom_mark_ellipse(aes(group = specimen_type, label = specimen_type),
color = "grey") +
coord_fixed(ratio = pca_perc[2] / pca_perc[1])
PC1 - specimen type, lesser extent: protocol PC2 - batch + type interaction
Next, try centering each specimen, weighting the two protocols equally, and redoing the PCA, so as to focus in on bias.